from fbri.private.sql.query import execute
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import math
database = 'fbri_prod_private'
breakdown_table = 'erc_condor_url_breakdowns_dp_clean_partitioned_v2'
attributes_table = 'erc_condor_url_attributes_dp_final_v2'
#MAKE SURE THAT THIS IS V2
from pathlib import Path

# assume that this script is being run from the root of the project directory
PROJECT_DIR = Path("/home/jovyan/ss1_curation_fb/")

# ### This notebook is how we get all Condor data relevant to the curation bubbles project.
# 
# General idea:
# What are all the domains available in Condor?  
# How many of them clear some arbitrary share threshold? (aka have more than a million shares?)  
# (Note that we are including shares from users with NAN political ideology here)
# 
# Get all domains with more than 10 million shares.
# Merge all of their engagement data.
# 
# Then, get all domains with more than milllion shares.
# Merge all of their engagement data.
# 
# Then, merge the two resulting datasets together.

# ## 1) Identify all domains in Condor, and each domain's total number of shares.

# #across all of US Facebook, this gets all domains and how many times they've been shared (broken by ideology for no reason)
# #should take about 256 seconds to query, not long to write
print("1) Identify domains")

if not (PROJECT_DIR / "data/all_domains_shares.tsv").exists(): 
    sql = f"""
        SELECT 
            a.parent_domain, 
            SUM(b.shares)
        FROM 
            {database}.{breakdown_table} b
        JOIN
            {database}.{attributes_table} a
        ON 
            a.url_rid = b.url_rid
        WHERE 
            b.c = 'US'
            AND a.public_shares_top_country = 'US' 
            AND date_format(a.first_post_time, '%Y-%m') = b.year_month   
            AND CAST(date_format(a.first_post_time, '%Y') AS INT) BETWEEN 2017 AND 2019
        GROUP BY 
            a.parent_domain
    """

    result = execute(sql, PROJECT_DIR / "data/all_domains_shares.tsv") 

share_df = pd.read_csv(PROJECT_DIR / "data/all_domains_shares.tsv", sep = '\t')
share_df.columns = ['parent_domain', 'total_shares']

target_domains_tenmillion = set((share_df[share_df['total_shares'] > 10_000_000]['parent_domain']))
target_domains_million = set((share_df[share_df['total_shares'] > 1_000_000]['parent_domain']))
target_domains_million = tuple(target_domains_million - target_domains_tenmillion)
target_domains_million = tuple([i for i in target_domains_million if i==i]) #nan removal


# ## 2) Queries:

# Each query can only take approx an hour to both execute the query and write the result to disk. Therefore, we can't have one big query, we have to have several small queries and merge them together. 
# 
# There are currently two sets of queries:
# 
# 1) 120 domains (above ten million shares)
# 2) 1012 domains (above 1 million shares)
# 
# Sarah wrote this function to easily slice a list into smaller sublists.
# 
# Note, this was not done for the first two sets of queries, but I would recommend sorting the domain list before querying (to fix the random ordering of slices (because Python sets have arbitrary ordering))

def get_slices(rids, max_count = 10):

    iters = len(rids) // max_count

    start = [i*max_count for i in range(iters)]
    end = [(i+1)*max_count for i in range(iters)]

    start.append(end[-1])
    end.append(len(rids)+1)

    tuple_rids = tuple(rids)
    slices = [tuple_rids[start[i]:end[i]] for i in range(iters+1)]
    
    return slices

# ### 2A) First set of queries

print("2A) query large domains")
slices = get_slices(target_domains_tenmillion, max_count = 10)


for i, domain_slice in enumerate(slices):
    fname = PROJECT_DIR / f"data/engagement_data{i}.tsv"
    if i >= 0 and domain_slice and not fname.exists():

        print(fname.resolve())
        sql = f"""
            SELECT a.parent_domain, 
                a.clean_url, 
                b.political_page_affinity, 
                SUM(b.views) AS views, 
                SUM(b.clicks) AS clicks, 
                SUM(b.shares) AS shares, 
                (SUM(b.likes) + SUM(b.loves) + SUM(b.hahas) + SUM(b.wows) + SUM(b.sorrys) + SUM(b.angers)) AS reacts

            FROM {database}.{breakdown_table} b
            JOIN {database}.{attributes_table} a
            ON (  a.url_rid = b.url_rid
            )
            WHERE 
                b.c = 'US'
                AND a.public_shares_top_country = 'US' 
                AND a.parent_domain IN {domain_slice}
                AND date_format(a.first_post_time, '%Y-%m') = b.year_month

            GROUP BY a.parent_domain, a.clean_url, b.political_page_affinity

        """

        result = execute(sql, PROJECT_DIR / f"data/engagement_data{i}.tsv") 

        
dfs = []
for fname in PROJECT_DIR.glob("data/engagement_data*.tsv"):
    df = pd.read_csv(fname, sep = '\t')
    dfs.append(df)
    
ten_mil_df = pd.concat(dfs, axis = 0)
ten_mil_df.reset_index(inplace = True)
ten_mil_df.to_feather(PROJECT_DIR / 'data/ten_million_domains_engagement.feather')   
ten_mil_df = pd.read_feather(PROJECT_DIR / 'data/ten_million_domains_engagement.feather')
print(f"found {ten_mil_df['parent_domain'].nunique()} large domains")

# ### 2B) Second set of queries
print(f"2B) query medium domains")
slices = get_slices(target_domains_million, max_count = 10)

for i, domain_slice in enumerate(slices):
    fname = PROJECT_DIR / f"data/million/million_engagement_data_{i}.tsv"
    if i >= 0 and not fname.exists():
        print(fname.resolve())
        sql = f"""
            SELECT a.parent_domain, 
                a.clean_url, 
                b.political_page_affinity, 
                SUM(b.views) AS views, 
                SUM(b.clicks) AS clicks, 
                SUM(b.shares) AS shares, 
                (SUM(b.likes) + SUM(b.loves) + SUM(b.hahas) + SUM(b.wows) + SUM(b.sorrys) + SUM(b.angers)) AS reacts

            FROM {database}.{breakdown_table} b
            JOIN {database}.{attributes_table} a
            ON (  a.url_rid = b.url_rid  )
            WHERE 
                b.c = 'US'
                AND a.public_shares_top_country = 'US' 
                AND a.parent_domain IN {domain_slice}
                AND date_format(a.first_post_time, '%Y-%m') = b.year_month

            GROUP BY a.parent_domain, a.clean_url, b.political_page_affinity

        """

        result = execute(sql, PROJECT_DIR / f"data/million/million_engagement_data_{i}.tsv") 

dfs = []
for fname in PROJECT_DIR.glob("data/million/million_engagement_data*.tsv"):
    df = pd.read_csv(fname, sep = '\t')
    dfs.append(df)
    
mil_df = pd.concat(dfs, axis = 0)
mil_df.reset_index(inplace = True)
mil_df.to_feather(PROJECT_DIR / 'data/million_domains_engagement.feather')
mil_df = pd.read_feather(PROJECT_DIR / 'data/million_domains_engagement.feather')
print(f"found {mil_df['parent_domain'].nunique()} medium domains")

print("3) combine data for output")
final_df = pd.concat([mil_df, ten_mil_df])

print(f"{final_df['parent_domain'].nunique()} total domains")

# ### 3A) remove duplicates

grouped  = final_df.groupby(['parent_domain', 'clean_url']).count()
duplicate_urls = grouped[grouped['index'] > 6]
duplicate_urls = set(duplicate_urls.reset_index()['clean_url'])

dup_df =final_df[final_df['clean_url'].isin(duplicate_urls)]
final_df = final_df.drop_duplicates(subset = ['parent_domain', 'clean_url', 'views', 'clicks', 'shares', 'reacts']).copy()

final_df = final_df[~final_df['clean_url'].isna()].copy()



final_df.reset_index().to_feather(PROJECT_DIR / 'data/all_ss1_domains_engagement.feather')

